MODEL COMPUTATION (reference and validation data pre-processing)

Library loading, function definition and gene annotation

library(limma)
library(DESeq2)
library(biomaRt)
library(org.Hs.eg.db)
library(readxl)
library(reshape2)
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(caret)
library(steFunctions)
library(EnsDb.Hsapiens.v79)
library(heatmap3)
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(doMC)
library(pROC)

#define function to get color gradient 

#' Map a numeric vector to a gradient of colors.
#'
#' @param colValue Numeric vector (named); names are carried over to the output.
#' @param minRange,maxRange Optional scale limits; when NULL they default to the
#'   data range padded by 0.001 so the extremes fall strictly inside the breaks.
#' @param ccolors Colors interpolated from low to high to build the gradient.
#' @param bbreak Step between consecutive breaks of the color scale.
#' @return Character vector of colors, one per element of `colValue`, named
#'   after `colValue`.
getGradientColors <- function(colValue, minRange = NULL, maxRange = NULL, ccolors = c('green3', 'red'), bbreak = 0.01){
  if(is.null(minRange)){
    minRange <- min(colValue, na.rm = TRUE) - 0.001
  }
  if(is.null(maxRange)){
    maxRange <- max(colValue, na.rm = TRUE) + 0.001
  }
  bre <- seq(minRange, maxRange, bbreak)
  # colByValue draws a legend plot as a side effect; the device is closed
  # below so only the color assignment is kept. Namespaced call replaces the
  # previous in-function library(heatmap3) (package is attached at file top).
  colori <- heatmap3::colByValue(colValue, col = colorRampPalette(ccolors)(length(bre)-1), breaks= bre, cex.axis = 0.8)
  colori <- as.vector(colori)
  graphics.off()
  names(colori) <- names(colValue)
  return(colori)
}

# Project paths: intermediate files, pipeline data, and paper figures.
MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
wdd="/home/mclaudia/MEDIA Project/OAStratification-pipeline/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

# Reference (unpaired) DE results and DESeq2 object, used below for the
# TPM transformation of the reference dataset.
load(paste0(MEDIAsave,"genide-bulk-unpaired.RData"),verbose=TRUE) #for TPM transformation
## Loading objects:
##   OAvsnonOA
##   dds0
##   OAvsnonOAsig
##   OAvsnonOA_dup
##   gn_dup
# Consensus up-/down-regulated gene lists (sheet 1 = up, sheet 2 = down).
up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)

# Ensembl gene-id -> gene-symbol lookup table from EnsDb.Hsapiens.v79.
ensdf <- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type="DataFrame")
ens2gene <- as.data.frame(ensdf[,c("gene_id","gene_name")])

We use as reference Dataset 3 (Unpaired)

Data is transformed in TPM using the gene length provided

Fix duplicates for gene length

# Per-sample gene lengths from the tximport object; patient metadata.
load(paste0(wdd,"data/txi.RData"))
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)

# Keep one length column per unique patient, then attach gene symbols.
tmp <- txi$length[,match(unique(as.character(patientDetails$name)),colnames(txi$length))]
tmp<-merge(tmp,ens2gene,by.x="row.names",by.y="gene_id")

# Genes kept in the previous analysis under their ENSG identifier.
maintain <-OAvsnonOA[grep("^ENSG.+",OAvsnonOA$Row.names),]

maintain_length <-tmp[tmp$Row.names %in% maintain$Row.names,]

# Remaining genes: find symbols mapped by multiple ENSG ids.
tmp <-tmp[!tmp$Row.names %in% maintain$Row.names,]
dupp <-tmp[duplicated(tmp$gene_name),"gene_name"]
gn_dup <-unique(dupp)   #duplicated for filtered genes from the previous analysis


# One averaged length row per duplicated symbol.
length2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(maintain_length)))
colnames(length2) <-colnames(maintain_length)
rownames(length2) <-paste0("ENS_",1:length(gn_dup))
# NOTE(review): columns 1 and 72 (Row.names and gene_name) are dropped by
# hard-coded index — assumes exactly 70 sample columns; verify if inputs change.
length2 <-length2[,-c(1,72)]

# Mean length (3 decimals) across all ENSG ids sharing a symbol.
for(i in 1:length(gn_dup)){
  x=gn_dup[i]
  y=tmp[tmp$gene_name %in% x,-c(1,72)]
  length2[i,1:ncol(length2)] <-apply(y,2,function(x) round(mean(as.numeric(x)),3))
}

length2 <-cbind(Row.names=rownames(length2),length2,gene_name=gn_dup)  #fixed length of duplicated genes from maintained 1190
length2 <-length2[length2$gene_name %in% unique(OAvsnonOA_dup$hgnc_symbol),] #filter for duplicated from previous analysis 440

tmp3 <-rbind(maintain_length,as.matrix(length2))  #combine the two matrices 

# Numeric gene-length matrix (samples in columns), keyed by gene symbol.
gene_length <-tmp3[,2:(ncol(tmp3)-1)]
gene_length <-t(apply(gene_length,1,as.numeric))
rownames(gene_length) <-tmp3$gene_id
colnames(gene_length) <-colnames(tmp3[,2:(ncol(tmp3)-1)])

rownames(gene_length) <-tmp3$gene_name

# Raw counts from the reference DESeq2 object.
cc=assay(dds0)

dim(cc)
## [1] 15296    70
dim(gene_length)
## [1] 15291    70

Compute TPM for reference dataset and then log2 transform

# Align counts and gene lengths on the reference gene set.
cc <-cc[OAvsnonOA$Row.names,]                      #count matrix
gene_length <-gene_length[OAvsnonOA$hgnc_symbol,] #gene length matrix

# Sanity checks: rows of both matrices are in the same gene order.
identical(rownames(gene_length),as.character(OAvsnonOA$hgnc_symbol))
## [1] TRUE
identical(rownames(cc),as.character(OAvsnonOA$Row.names))
## [1] TRUE
# TPM: length-normalize counts, then scale each sample to sum to 1e6.
x <- cc/gene_length
tpm.mat <- t((t(x) * 1e6)/colSums(x))
rownames(tpm.mat) <-OAvsnonOA$hgnc_symbol
head(colSums(tpm.mat))
## MRI001 MRI002 MRI003 MRI004 MRI005 MRI006 
##  1e+06  1e+06  1e+06  1e+06  1e+06  1e+06
# log2 with a small pseudocount to avoid log2(0).
train_tpm <-log2(tpm.mat+0.001)
boxplot(train_tpm)

Load Validation pre-processed dataset

# Pre-processed validation dataset (DESeq2 object `dds`).
load(paste0(MEDIAsave,"validationDatasetPreprocessed.RData"),verbose = TRUE)
## Loading objects:
##   dds

Download gene length for TPM transformation for the Validation dataset

# biomaRt connection for symbol -> Ensembl id mapping.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

tmp=assay(dds)
genes <- rownames(tmp)
# bmHeader = T returns human-readable column names (`HGNC symbol`, etc.).
bmObj <- getBM(attributes = c("hgnc_symbol", "ensembl_gene_id"), 
                      filters = "hgnc_symbol", values = genes, bmHeader = T, mart = mart)
# Keep one Ensembl id per symbol (lowest stable id after sorting).
bmObj <-bmObj[order(bmObj$`Gene stable ID`),]
bmObj <-bmObj[!duplicated(bmObj$`HGNC symbol`),]  


# Restrict to genes present both in the validation matrix and in biomaRt.
ii=intersect(genes,bmObj$`HGNC symbol`)
rownames(bmObj) <-bmObj$`HGNC symbol`

bmObj <-bmObj[ii,]
genes <- genes[genes %in% ii]
identical(genes,bmObj$`HGNC symbol`)
## [1] TRUE
# Gene lengths from hg19 annotation (org.db mode).
out <- EDASeq::getGeneLengthAndGCContent(id = bmObj$`Gene stable ID`, #using hg19 annotation, download gene length
                                         org = 'hg19',mode = "org.db")

gene_length <-na.omit(out[,1,drop=FALSE])

# Drop genes without a length, keeping counts/lengths/annotation aligned.
bmObj <-bmObj[bmObj$`Gene stable ID` %in% rownames(gene_length),]
genes <-genes[genes %in% rownames(bmObj)]

tmp <-tmp[genes,]

identical(rownames(gene_length),bmObj$`Gene stable ID`)
## [1] TRUE
identical(rownames(tmp),bmObj$`HGNC symbol`)
## [1] TRUE
dim(tmp)
## [1] 13916    38
dim(gene_length)
## [1] 13916     1
# TPM for the validation dataset (same formula as the reference).
x <- apply(tmp,2,function(x) x/as.numeric(gene_length[,1]))
tpm.mat2 <- t((t(x) * 1e6)/colSums(x))
head(colSums(tpm.mat2))
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##            1e+06            1e+06            1e+06            1e+06 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##            1e+06            1e+06
validation_tpm <-log2(tpm.mat2+0.001)

boxplot(validation_tpm)

Rescale validation TPM-log2 data based on reference

1. Intersect reference genes with the ones of the sample of interest (or validation dataset)

2. quantile normalize the reference dataset

ii=intersect(rownames(train_tpm),rownames(validation_tpm)) #select common genes

train_tpm <-train_tpm[ii,]
validation_tpm <-validation_tpm[ii,]

dim(train_tpm)
## [1] 12702    70
dim(validation_tpm)
## [1] 12702    38
# Quantile-normalize the reference dataset across samples.
# NOTE(review): quantileNormalization presumably comes from the steFunctions
# package loaded at the top — confirm; it normalizes rows, hence the t()/t().
tmp <-t(quantileNormalization(t(train_tpm)))

train_tpm <-tmp
boxplot(train_tpm)

# Reference quantile profile: sorted (decreasing) expression values of one
# sample; after quantile normalization all samples share this profile.
list_ranks <- as.numeric(sort(train_tpm[,1],decreasing=TRUE))

head(list_ranks)
## [1] 15.94666 14.86583 14.46304 14.02892 13.77931 13.56793

3. rescale the sample of interest for the quantile interval (or validation dataset)

## Map each validation sample onto the reference quantile profile: genes
## ranked by decreasing expression receive the corresponding reference
## quantile values (list_ranks). (Removed a dead pre-allocation of list_val
## that was immediately overwritten by the apply() call.)

# For every sample, genes in decreasing order of expression; only the gene
# ordering is used downstream, the values are discarded.
list_val <-apply(validation_tpm, 2, function(x) as.list(sort(x,decreasing = TRUE)))

list_valRank <-vector("list",ncol(validation_tpm))
names(list_valRank) <-colnames(validation_tpm)


# Replace each sample's sorted values with the reference quantile values,
# keeping the sample-specific gene ordering.
for(i in 1:length(list_val)){
  gg <-names(unlist(list_val[[i]]))
  list_valRank[[i]] <-list_ranks
  names(list_valRank[[i]]) <- gg
}

# Assemble the re-scaled validation matrix back in the original gene order.
normdata2 <-matrix(NA,nrow = nrow(validation_tpm), ncol=ncol(validation_tpm))
rownames(normdata2) <-rownames(validation_tpm)
colnames(normdata2) <-colnames(validation_tpm)

for(i in 1:length(list_valRank)){
  tmp <-as.data.frame(list_valRank[[i]])
  rownames(tmp) <-names(list_valRank[[i]])
  normdata2[,i] <-tmp[rownames(normdata2),]
}
boxplot(normdata2)  #Validation re-scaled

boxplot(train_tpm)  #Reference

Save intermediate files

# Persist log2-TPM matrices and the re-scaled validation matrix.
save(validation_tpm,normdata2,train_tpm, file=paste0(MEDIAsave,"reference_validation_tpm_log2_and_qn_fin.RData"))

ELASTIC NET model (select most informative features)

  1. Run the model 100 times (100 bootstrap samples of 20 elements: 10 random OA and 10 healthy samples)
  2. Then select the features that occurred in at least 50% of the runs
  3. Compute the mean beta values across runs
  4. Compute scores per patient (linear combination) with the selected features' gene expression values
# Work on the re-scaled validation data restricted to the consensus genes.
validation_tpm <-normdata2
validation_tpm <-validation_tpm[rownames(validation_tpm) %in% c(up_genes$gene,dw_genes$gene),] #select consensus genes present
dim(validation_tpm)
## [1] 42 38
train_tpm <-train_tpm[rownames(train_tpm) %in% rownames(validation_tpm),]  #quantile reference
dim(train_tpm)
## [1] 42 70
# Full-gene copies kept aside for the Ridge model further down.
train_tpm_forRidge <- train_tpm
validation_tpm_forRidge <- normdata2

Set 100 OA sample runs: balance data for normal samples (bootstrap)

# 100 balanced subsets: the 10 normal samples (columns 1:10) plus 10 OA
# samples drawn without replacement from columns 11:70.
set.seed(7694)
runs <-vector("list",100)
names(runs) <-paste0("run_",1:100)
runs <-lapply(runs,function(x) x <- sample(11:70, 10, replace = FALSE))

subsets <-vector("list",100)
names(subsets) <-paste0("run_",1:100)

for(i in 1:length(subsets)) subsets[[i]] <-train_tpm[,c(1:10,as.numeric(runs[[i]]))]  #create subsets

Train

Tune the model

# Leave-one-out cross-validation settings shared by all runs.
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

# Class labels for each balanced subset: 10 normal then 10 OA.
ClassBoot=as.factor(c(rep("N",10),rep("OA",10)))

# One coefficient set per run will be stored here.
list_tunes <-vector("list",100)
names(list_tunes) <-paste0("tunes_",1:100)

Train the model for 100 runs

nCores <- 250 #change number of threads if needed

registerDoMC(nCores)
# Fit an elastic-net model per subset and keep the coefficients at the
# optimal lambda selected by LOOCV accuracy.
for(i in 1:length(subsets)){
  train_dataset <-t(subsets[[i]])
  train <- train(train_dataset,
                           y = ClassBoot,
                           method = "glmnet", 
                           trControl = trainCtrl.rpcv,
                           metric = "Accuracy", allowParallel=TRUE,
                           tuneGrid = expand.grid(alpha = 0.5,  #for elastic net, median between 0 and 1
                                                  lambda = seq(0.001,1,by = 0.001)))
  list_tunes[[i]] <- coef(train$finalModel, train$finalModel$lambdaOpt)
}
save(list_tunes, file=paste0(MEDIAsave,"list_tunes_100_def.RData"))

Assess presence of each feature for each run

# Binary occurrence matrix: runs x genes, 1 if the gene received a non-zero
# coefficient in that run.
table_occurrence <-matrix(0,100,length(rownames(train_tpm)))
colnames(table_occurrence) <-rownames(train_tpm)
rownames(table_occurrence) <-paste0("run_",1:100)

for(i in 1:nrow(table_occurrence)){
  # Drop the intercept ([-1]) and keep per-gene coefficients.
  tmp=list_tunes[[i]][,1][-1]
  table_occurrence[i,names(tmp)] <-tmp
  }
table_occurrence[table_occurrence != 0] <-1

tbO <-melt(table_occurrence)

# Tile plot: feature presence per run.
ggplot(tbO, aes(y=Var2, x=Var1, fill=value))+
  geom_tile()+
  theme(axis.text.y = element_text(size=12))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=8))+
  xlab("Runs")+
  ylab("Genes")+
  scale_fill_continuous(high = "red", low = "blue")

# Genes selected in at least 50 of the 100 runs.
(selected <-sort(colSums(table_occurrence)[colSums(table_occurrence) >= 50],decreasing = TRUE))
## TNFSF11   THBS3    DNER   LOXL3    DYSF   HTRA1    ASPN   TNNT3 
##     100      95      94      94      90      90      87      51

Select features with at least the 50% of occurrence and compute mean Beta value across the runs

selected1 <-names(selected)

# Per-run coefficients for every gene (NA where a gene was not selected).
meanvalues <-matrix(NA,100,nrow(validation_tpm))
rownames(meanvalues) <-paste0("run_",1:100)
colnames(meanvalues) <-rownames(train_tpm) #42 genes in common between validation and reference

for(i in 1:nrow(meanvalues)) meanvalues[i,rownames(list_tunes[[i]])[-1]] <-as.numeric(list_tunes[[i]])[-1]

# Mean beta per selected feature across the 100 runs.
# NOTE(review): colMeans with NA present would yield NA; the printed output
# below implies the selected genes have coefficients in every counted run.
meanvalues <-meanvalues[,selected1]
(cf <-colMeans(meanvalues))
##    TNFSF11      THBS3       DNER      LOXL3       DYSF      HTRA1       ASPN 
## 0.13556325 0.11050480 0.14982991 0.06883307 0.07084835 0.04097398 0.02928284 
##      TNNT3 
## 0.02143482
barplot(sort(cf,decreasing = TRUE), col = "dodgerblue", cex.names=1.2) #plot coefficients

Computing score (sR) using the beta values: linear combination with the gene expression

Score computed on Reference dataset, exploring also the distribution of this score across the patients

# Full reference labels: 10 normal + 60 OA samples.
ClassTRfull=as.factor(c(rep("N",10),rep("OA",60)))
score_df <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)

# Score sR = linear combination of the 8 selected genes' expression with
# their mean beta coefficients.
train_sc <-t(train_tpm[names(cf),])

sc <-rowSums(t(apply(train_sc,1,function(x) x*cf))) #compute score

rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc

score_df_train <-score_df
rm(score_df)
colnames(score_df_train)<-c("Patient","Score","Class")

# Score distribution per class (density).
ggplot(score_df_train, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

# Score per class (boxplot with significance test).
ggplot(score_df_train, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()

How does the score discriminate between healthy and affected patients? REFERENCE

# Per-patient gradient color encoding the sR score (blue scale).
col1 <-getGradientColors(score_df_train$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_train$Score)+ 0.01, minRange = min(score_df_train$Score)-0.01)

score_df_train$col1 <-col1
score_df_train <-score_df_train[order(score_df_train$Patient),]

# Class colors: first 10 patients normal (plum), remaining 60 OA (cyan).
# NOTE(review): assumes patient names sort normals before OA — verify.
score_df_train$col2 <-c(rep("plum",10),rep("cyan",60))

sum(up_genes$gene %in% rownames(train_tpm))
## [1] 37
sum(dw_genes$gene %in% rownames(train_tpm))
## [1] 5
# Consensus genes present in the reference: 37 up + 5 down = 42.
sel <-c(up_genes$gene,dw_genes$gene)[c(up_genes$gene,dw_genes$gene) %in%  rownames(train_tpm)]  #42
train_tpm <-train_tpm[sel,] 

# Row annotation: red = up-regulated, blue = down-regulated consensus genes.
annGenes <-data.frame(genes=sel, col3=c(rep("red",37),rep("blue",5)))

# Continuous color scale for the score legend (min, Q1, Q3, max anchors).
summ=summary(score_df_train$Score)
col_fun = colorRamp2(c(summ[1],summ[2],summ[5],summ[6]), c("#00BFFF","#197CDD","#0F4DB3","#000080"))

score_df_train <-score_df_train[order(score_df_train$Score,decreasing = FALSE),]  #order the patients for increasing Score
head(score_df_train)
##        Patient    Score Class    col1 col2
## MRI009  MRI009 1.368077     N #00BFFF plum
## MRI008  MRI008 1.501484     N #04B7FF plum
## MRI002  MRI002 1.566680     N #06B4FF plum
## MRI006  MRI006 1.691381     N #0AAEFF plum
## MRI004  MRI004 1.718341     N #0BACFF plum
## MRI005  MRI005 1.737940     N #0BACFF plum

Heatmap: z-score computation of the data with patients sorted for increasing score

# Z-score each gene across samples and cap at +/-3 so single outliers do not
# dominate the color scale.
train_tpm_scal <-t(apply(train_tpm,1,scale))
rownames(train_tpm_scal) <-rownames(train_tpm)
colnames(train_tpm_scal) <-colnames(train_tpm)

train_tpm_scal[train_tpm_scal < -3] <- -3
train_tpm_scal[train_tpm_scal > 3] <- 3

# Heatmap: consensus gene order in rows, patients ordered by increasing
# score in columns (no column clustering).
heatmap3(train_tpm_scal[annGenes$genes,rownames(score_df_train)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(10,10),
         ColSideColors = as.matrix(score_df_train[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,0.9,legend=c("OA","Normal"),cex=1.5,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,0.9,legend=c("UP","DOWN"),cex=1.5,
       fill=c("red","blue"), bty = "n", title = "DE")
# Continuous legend for the sR score (ComplexHeatmap).
draw(Legend(col_fun = col_fun, title = "score R"), x = unit(14, "in"), y = unit(8, "in"))

Score computed on Validation dataset, exploring also the distribution of this score across the patients

# Validation labels: 18 normal + 20 OA samples.
ClassV=factor(c(rep("N",18),rep("OA",20)))
score_df <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)

# Same sR linear combination applied to the validation data.
val_sc <-t(validation_tpm[names(cf),])
identical(names(cf), colnames(val_sc)) #TRUE
## [1] TRUE
sc <-rowSums(t(apply(val_sc,1,function(x) x*cf)))

rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc
head(sc)
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##         2.722755         3.005583         2.078916         3.007800 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##         1.948965         2.135283
score_df_val <-score_df
rm(score_df)
colnames(score_df_val)<-c("Patient","Score","Class")

# Score distribution per class (density).
ggplot(score_df_val, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

# Score per class (boxplot with significance test).
ggplot(score_df_val, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()

How does the score discriminate between healthy and affected patients? VALIDATION

# Per-patient gradient color for the validation sR score.
col1 <-getGradientColors(score_df_val$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_val$Score)+ 0.01, minRange = min(score_df_val$Score)-0.01)
score_df_val$col1 <-col1
score_df_val <-score_df_val[order(score_df_val$Patient),]

# Class colors: 18 normal (plum) then 20 OA (cyan) after name ordering.
# NOTE(review): assumes names sort normals first — verify against colnames.
score_df_val$col2 <-c(rep("plum",18),rep("cyan",20))

sum(up_genes$gene %in% rownames(validation_tpm))
## [1] 37
sum(dw_genes$gene %in% rownames(validation_tpm))
## [1] 5
validation_tpm <-validation_tpm[sel,] 
# Validation-specific continuous color scale for the score legend.
summ2=summary(score_df_val$Score)
col_fun2 = colorRamp2(c(summ2[1],summ2[2],summ2[5],summ2[6]), c("#00BFFF","#1C88F2","#0D40AB","#000080"))

score_df_val <-score_df_val[order(score_df_val$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_val)
##                         Patient     Score Class    col1 col2
## normal_06             normal_06 0.7715032     N #00BFFF plum
## normal_04             normal_04 1.3818032     N #0FA7FF plum
## normal_02             normal_02 1.3919973     N #0FA6FF plum
## normal_10             normal_10 1.5136887     N #12A2FF plum
## Normal_Cart_5_5 Normal_Cart_5_5 1.9489655     N #1D91FF plum
## normal_07             normal_07 2.0107593     N #1D8FFE plum

Heatmap: z-score computation of the data with patients sorted for increasing score

# Z-score each gene across validation samples and cap at +/-3 so single
# outliers do not dominate the color scale.
validation_tpm_scal <-t(apply(validation_tpm,1,scale))
rownames(validation_tpm_scal) <-rownames(validation_tpm)
colnames(validation_tpm_scal) <-colnames(validation_tpm)

validation_tpm_scal[validation_tpm_scal < -3] <- -3
validation_tpm_scal[validation_tpm_scal > 3] <- 3

# Heatmap: consensus gene order in rows, patients ordered by increasing
# score in columns (no column clustering).
heatmap3(validation_tpm_scal[annGenes$genes,rownames(score_df_val)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(10,10),
         ColSideColors = as.matrix(score_df_val[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,0.9,legend=c("OA","Normal"),cex=1.5,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,0.9,legend=c("UP","DOWN"),cex=1.5,
       fill=c("red","blue"), bty = "n", title = "DE")
# BUG FIX: the score legend must use the validation color scale (col_fun2,
# built from score_df_val above), not the reference scale (col_fun).
draw(Legend(col_fun = col_fun2, title = "score R"), x = unit(14, "in"), y = unit(8, "in"))

Performance analysis: ROC curve on Validation dataset for the ElasticNet model (sR score)

ClassV=as.factor(c(rep("N",18),rep("OA",20)))

# Order by class so scores align with the N-then-OA label vector.
score_df_val <-score_df_val[order(score_df_val$Class),]
rocobj <- roc(ClassV,score_df_val$Score) 

auc <-round(rocobj$auc,3)

# ROC curve for the 8-feature sR score on the validation dataset.
ggroc(rocobj, size = 2, col="dodgerblue",legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score R on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

# Kept for the model comparison below.
rocobj_val <-rocobj

RIDGE model (all the features available)

##Train ### Tune the model

# Ridge model uses all 42 consensus genes (no feature selection).
validation_tpm <-validation_tpm_forRidge[rownames(validation_tpm_forRidge) %in% c(up_genes$gene,dw_genes$gene),]
train_tpm <-train_tpm_forRidge[rownames(train_tpm_forRidge) %in% rownames(validation_tpm),]  


ClassTRfull=as.factor(c(rep("N",10),rep("OA",60)))
# Same LOOCV setup as the elastic-net runs.
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

nCores <- 250 #change number if needed
registerDoMC(nCores)

# Single ridge fit (alpha = 0) on the full reference dataset; coefficients
# at the optimal lambda, dropping the intercept ([-1]).
set.seed(3744)
train_dataset <-t(train_tpm)
train <- train(train_dataset,
                 y = ClassTRfull,
                 method = "glmnet", 
                 trControl = trainCtrl.rpcv,
                 metric = "Accuracy", allowParallel=TRUE,
                 tuneGrid = expand.grid(alpha = 0, #ridge
                                        lambda = seq(0.001,1,by = 0.001)))
cf_all <-coef(train$finalModel, train$finalModel$lambdaOpt)[,1][-1]

save(cf_all, file=paste0(MEDIAsave,"loadings_scores_all_ridge_fin.RData"))
barplot(sort(cf_all,decreasing = TRUE), col = "dodgerblue",cex.names = 0.6) #plot coefficients

Compute score for Reference dataset using beta values from the Ridge model

# Score sT on the reference: linear combination of all 42 consensus genes
# with the ridge coefficients.
score_df_all_tr <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)

train_sc_all <-t(train_tpm[names(cf_all),])

sc1 <-rowSums(t(apply(train_sc_all,1,function(x) x*cf_all))) 

rownames(score_df_all_tr) <-score_df_all_tr[,1]
score_df_all_tr <-score_df_all_tr[names(sc1),]
score_df_all_tr$score <-sc1
head(sc1)
##   MRI001   MRI002   MRI003   MRI004   MRI005   MRI006 
## 7.298994 3.104174 5.247277 4.097568 5.172805 4.335652

# Same sT score on the validation dataset.
score_df_all_val <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)

val_sc_all <-t(validation_tpm[names(cf_all),])
identical(names(cf_all), colnames(val_sc_all)) #TRUE
## [1] TRUE
sc <-rowSums(t(apply(val_sc_all,1,function(x) x*cf_all))) 

rownames(score_df_all_val) <-score_df_all_val[,1]
score_df_all_val <-score_df_all_val[names(sc),]
score_df_all_val$score <-sc

Performance analysis: ROC curve on Validation dataset - RIDGE

score_df_all_val <-score_df_all_val[order(score_df_all_val$score, decreasing=FALSE),]

# ROC for the 42-feature ridge score (sT) on the validation dataset.
rocobj_val_all<- roc(score_df_all_val$class,score_df_all_val$score) 

auc <-round(rocobj_val_all$auc,3)

ggroc(rocobj_val_all, size = 2, col="dodgerblue", legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score T on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

# Compare sR (8 features) vs sT (42 features) ROC curves.
list_rocs <-list(rocobj_val,rocobj_val_all)
names(list_rocs) <-c("Score R - 8 features","Score T - 42 features")

auc <- lapply(list_rocs, function(x) round(x$auc,3))

auc[1]
## $`Score R - 8 features`
## [1] 0.883
auc[2]
## $`Score T - 42 features`
## [1] 0.922
# DeLong's test: no significant AUC difference between the two models.
rocts <-roc.test(rocobj_val,rocobj_val_all)
rocts
## 
##  DeLong's test for two ROC curves
## 
## data:  rocobj_val and rocobj_val_all
## D = -0.55529, df = 70.778, p-value = 0.5804
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8833333   0.9222222
# Overlaid ROC curves with AUC annotations.
ggroc(list_rocs, size = 2,legacy.axes = TRUE) +
    theme_bw()+
    annotate("text", 0.75, 0.44,  label=expression("AUC 8 features - score s"["R"] : 0.883) ,size=7)+
    annotate("text", 0.75,0.4, label=expression("AUC 42 features - score s"["T"] : 0.922) ,size=7)+
    labs(x = "1 - Specificity",y = "Sensitivity",color='Models')+
    theme(legend.title=element_text(size=16),legend.text=element_text(size=14))+
       guides(Models = guide_legend(override.aes = list(size = 4)))

# Persist both models' scores for reference and validation datasets.
list_scores <-list(score_df_all_tr, score_df_all_val, score_df_train, score_df_val)
names(list_scores) <-c("scoreT Training","scoreT Valid","scoreR Training","scoreR Valid")
save(list_scores,file=paste0(MEDIAsave,"list_scores.RData"))